import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
# To impute missing values
from sklearn.impute import KNNImputer
# Libtune to tune model, get different metric scores
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
# Load the bank churn dataset and preview the first rows
data = pd.read_csv('BankChurners.csv')
data.head()  # quick visual check of columns and sample values
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | ... | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | ... | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | ... | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | ... | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | ... | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 |
5 rows × 21 columns
data.drop('CLIENTNUM', inplace = True, axis = 1) #dropping the client-ID column — a unique identifier carries no predictive information
data.shape #shape of the data (rows, columns)
(10127, 20)
data.info() #data type info of every variable
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null object 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null object 3 Dependent_count 10127 non-null int64 4 Education_Level 8608 non-null object 5 Marital_Status 9378 non-null object 6 Income_Category 10127 non-null object 7 Card_Category 10127 non-null object 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null int64 10 Months_Inactive_12_mon 10127 non-null int64 11 Contacts_Count_12_mon 10127 non-null int64 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null int64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(9), object(6) memory usage: 1.5+ MB
data.isnull().sum().sort_values(ascending = False) #null values
Education_Level 1519 Marital_Status 749 Avg_Utilization_Ratio 0 Total_Ct_Chng_Q4_Q1 0 Customer_Age 0 Gender 0 Dependent_count 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Attrition_Flag 0 dtype: int64
pd.DataFrame(data={'% of Missing Values':round(data.isna().sum()/data.isna().count()*100,2)}) #null values as a percentage
| % of Missing Values | |
|---|---|
| Attrition_Flag | 0.0 |
| Customer_Age | 0.0 |
| Gender | 0.0 |
| Dependent_count | 0.0 |
| Education_Level | 15.0 |
| Marital_Status | 7.4 |
| Income_Category | 0.0 |
| Card_Category | 0.0 |
| Months_on_book | 0.0 |
| Total_Relationship_Count | 0.0 |
| Months_Inactive_12_mon | 0.0 |
| Contacts_Count_12_mon | 0.0 |
| Credit_Limit | 0.0 |
| Total_Revolving_Bal | 0.0 |
| Avg_Open_To_Buy | 0.0 |
| Total_Amt_Chng_Q4_Q1 | 0.0 |
| Total_Trans_Amt | 0.0 |
| Total_Trans_Ct | 0.0 |
| Total_Ct_Chng_Q4_Q1 | 0.0 |
| Avg_Utilization_Ratio | 0.0 |
data.nunique() #unique values within each variable
Attrition_Flag 2 Customer_Age 45 Gender 2 Dependent_count 6 Education_Level 6 Marital_Status 3 Income_Category 6 Card_Category 4 Months_on_book 44 Total_Relationship_Count 6 Months_Inactive_12_mon 7 Contacts_Count_12_mon 7 Credit_Limit 6205 Total_Revolving_Bal 1974 Avg_Open_To_Buy 6813 Total_Amt_Chng_Q4_Q1 1158 Total_Trans_Amt 5033 Total_Trans_Ct 126 Total_Ct_Chng_Q4_Q1 830 Avg_Utilization_Ratio 964 dtype: int64
#Making a list of all categorical variables (low-cardinality columns treated as categories)
cat_col=['Attrition_Flag', 'Gender','Dependent_count', 'Education_Level', 'Marital_Status',
         'Income_Category', 'Card_Category', "Total_Relationship_Count", "Months_Inactive_12_mon","Contacts_Count_12_mon"]
#Printing the count of each unique value in every categorical column
for column in cat_col:
    print(data[column].value_counts())
    print('-'*50)  # visual separator between columns
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64 -------------------------------------------------- F 5358 M 4769 Name: Gender, dtype: int64 -------------------------------------------------- 3 2732 2 2655 1 1838 4 1574 0 904 5 424 Name: Dependent_count, dtype: int64 -------------------------------------------------- Graduate 3128 High School 2013 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64 -------------------------------------------------- Married 4687 Single 3943 Divorced 748 Name: Marital_Status, dtype: int64 -------------------------------------------------- Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 abc 1112 $120K + 727 Name: Income_Category, dtype: int64 -------------------------------------------------- Blue 9436 Silver 555 Gold 116 Platinum 20 Name: Card_Category, dtype: int64 -------------------------------------------------- 3 2305 4 1912 5 1891 6 1866 2 1243 1 910 Name: Total_Relationship_Count, dtype: int64 -------------------------------------------------- 3 3846 2 3282 1 2233 4 435 5 178 6 124 0 29 Name: Months_Inactive_12_mon, dtype: int64 -------------------------------------------------- 3 3380 2 3227 1 1499 4 1392 0 399 5 176 6 54 Name: Contacts_Count_12_mon, dtype: int64 --------------------------------------------------
#Converting the data type of each categorical variable to 'category'
for column in cat_col:
    data[column]=data[column].astype('category')
data['Income_Category'] = data['Income_Category'].replace('abc', np.nan) #'abc' is a data-entry error; convert it to NaN so it is treated as missing
print(data['Income_Category'].value_counts())
print('------')
print(f"Missing Values for Income Category:{(data['Income_Category'].isnull().sum())} values")
Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 $120K + 727 Name: Income_Category, dtype: int64 ------ Missing Values for Income Category:1112 values
data.isnull().sum().sort_values(ascending = False)
Education_Level 1519 Income_Category 1112 Marital_Status 749 Avg_Utilization_Ratio 0 Total_Ct_Chng_Q4_Q1 0 Customer_Age 0 Gender 0 Dependent_count 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Attrition_Flag 0 dtype: int64
# data['Education_Level'] = data['Education_Level'].astype(str).replace('nan', 'is_missing').astype('category')
# data['Income_Category'] = data['Income_Category'].astype(str).replace('nan', 'is_missing').astype('category')
# data['Marital_Status'] = data['Marital_Status'].astype(str).replace('nan', 'is_missing').astype('category')
data.isnull().sum().sort_values(ascending = False)
Education_Level 1519 Income_Category 1112 Marital_Status 749 Avg_Utilization_Ratio 0 Total_Ct_Chng_Q4_Q1 0 Customer_Age 0 Gender 0 Dependent_count 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Attrition_Flag 0 dtype: int64
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null category 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null category 4 Education_Level 8608 non-null category 5 Marital_Status 9378 non-null category 6 Income_Category 9015 non-null category 7 Card_Category 10127 non-null category 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null category 10 Months_Inactive_12_mon 10127 non-null category 11 Contacts_Count_12_mon 10127 non-null category 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null int64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 dtypes: category(10), float64(5), int64(5) memory usage: 892.2 KB
data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Customer_Age | 10127.0 | 46.325960 | 8.016814 | 26.0 | 41.000 | 46.000 | 52.000 | 73.000 |
| Months_on_book | 10127.0 | 35.928409 | 7.986416 | 13.0 | 31.000 | 36.000 | 40.000 | 56.000 |
| Credit_Limit | 10127.0 | 8631.953698 | 9088.776650 | 1438.3 | 2555.000 | 4549.000 | 11067.500 | 34516.000 |
| Total_Revolving_Bal | 10127.0 | 1162.814061 | 814.987335 | 0.0 | 359.000 | 1276.000 | 1784.000 | 2517.000 |
| Avg_Open_To_Buy | 10127.0 | 7469.139637 | 9090.685324 | 3.0 | 1324.500 | 3474.000 | 9859.000 | 34516.000 |
| Total_Amt_Chng_Q4_Q1 | 10127.0 | 0.759941 | 0.219207 | 0.0 | 0.631 | 0.736 | 0.859 | 3.397 |
| Total_Trans_Amt | 10127.0 | 4404.086304 | 3397.129254 | 510.0 | 2155.500 | 3899.000 | 4741.000 | 18484.000 |
| Total_Trans_Ct | 10127.0 | 64.858695 | 23.472570 | 10.0 | 45.000 | 67.000 | 81.000 | 139.000 |
| Total_Ct_Chng_Q4_Q1 | 10127.0 | 0.712222 | 0.238086 | 0.0 | 0.582 | 0.702 | 0.818 | 3.714 |
| Avg_Utilization_Ratio | 10127.0 | 0.274894 | 0.275691 | 0.0 | 0.023 | 0.176 | 0.503 | 0.999 |
Observations:
def histogram_boxplot(feature):
    """Plot a boxplot and two histogram views of a numeric feature side by side.

    feature: 1-d numeric array / pandas Series

    Panel 1: boxplot.  Panel 2: histogram with KDE overlay.  Panel 3: KDE
    curve alone.  Mean (dashed) and median (solid) reference lines are
    drawn on both histogram panels.
    """
    figure, (ax_box2, ax_hist2, ax_hist3) = plt.subplots(
        nrows = 1, ncols = 3,   # one row, three panels
        figsize = (20, 5))
    figure.tight_layout(pad = 7)
    sns.boxplot(x = feature, ax = ax_box2, color = '#4B8BBE', orient = 'v')  # boxplot panel
    # sns.distplot is deprecated and removed in recent seaborn releases;
    # histplot(stat='density', kde=True) reproduces its histogram+KDE view,
    # and kdeplot reproduces its hist=False (curve-only) view.
    sns.histplot(feature, kde = True, stat = 'density', ax = ax_hist2, color = '#a9a38f')  # histogram + KDE
    sns.kdeplot(feature, ax = ax_hist3)  # KDE outline only
    ax_hist2.axvline(np.mean(feature), color='r', linestyle='--')       # mean on histogram
    ax_hist2.axvline(np.median(feature), color='black', linestyle='-')  # median on histogram
    ax_hist3.axvline(np.mean(feature), color = 'black', linestyle = '--')  # mean on KDE panel
    ax_hist3.axvline(np.median(feature), color='black', linestyle='-')     # median on KDE panel
histogram_boxplot(data['Customer_Age'])
Observations:
histogram_boxplot(data['Months_on_book'])
Observations:
histogram_boxplot(data['Credit_Limit'])
Observations:
histogram_boxplot(data['Total_Revolving_Bal'])
Observations:
histogram_boxplot(data['Avg_Open_To_Buy'])
Observations:
histogram_boxplot(data['Total_Amt_Chng_Q4_Q1'])
Observations:
histogram_boxplot(data['Total_Trans_Amt'])
Observations:
histogram_boxplot(data['Total_Trans_Ct'])
Observations:
histogram_boxplot(data['Total_Ct_Chng_Q4_Q1'])
Observation:
histogram_boxplot(data['Avg_Utilization_Ratio'])
Observations:
# Let's treat outliers by flooring and capping
def treat_outliers(df, col):
    """
    Floor and cap outliers in one numerical column using the 1.5*IQR rule.

    df: dataframe
    col: str, name of the numerical column to treat
    """
    q1, q3 = df[col].quantile([0.25, 0.75])  # lower and upper quartiles
    iqr = q3 - q1
    lower_whisker = q1 - 1.5 * iqr
    upper_whisker = q3 + 1.5 * iqr
    # values below the lower whisker are floored to it, values above the
    # upper whisker are capped to it; everything in between is untouched
    df[col] = df[col].clip(lower=lower_whisker, upper=upper_whisker)
    return df
def treat_outliers_all(df, col_list):
    """
    Apply IQR flooring/capping to every numerical column in col_list.

    df: data frame
    col_list: list of numerical column names
    """
    for column_name in col_list:
        df = treat_outliers(df, column_name)
    return df
numerical_col = data.select_dtypes(include=np.number).columns.tolist() #names of all numerical columns
data = treat_outliers_all(data, numerical_col) #floor/cap outliers in every numerical column
plt.figure(figsize=(20, 30))
for i, variable in enumerate(numerical_col): #one boxplot subplot per numerical variable to confirm outlier treatment
    plt.subplot(5, 4, i + 1)
    plt.boxplot(data[variable], whis=1.5)
    plt.tight_layout()
    plt.title(variable)
plt.show()
Observations:
Let's define a function to create barplots for the categorical variables indicating percentage of each category for that variables.
def bar_perc(plot, feature):
    '''
    Annotate each bar of a count plot with its percentage of the total.

    plot: matplotlib Axes holding the bar patches to annotate
    feature: 1-d categorical feature array the plot was drawn from
    '''
    total = len(feature)  # total number of observations in the column
    # BUG FIX: the original iterated and annotated the global `ax` instead of
    # the `plot` parameter, so the function only worked when the caller's
    # axes variable happened to be named `ax`.  Use the parameter.
    for p in plot.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height()/total)  # percentage of each class of the category
        x = p.get_x() + p.get_width() / 2 - 0.05  # x position: centre of the bar (small offset)
        y = p.get_y() + p.get_height()            # y position: top of the bar
        plot.annotate(percentage, (x, y), size = 12)  # annotate the percentage
data.describe(include = 'category').T
| count | unique | top | freq | |
|---|---|---|---|---|
| Attrition_Flag | 10127 | 2 | Existing Customer | 8500 |
| Gender | 10127 | 2 | F | 5358 |
| Dependent_count | 10127 | 6 | 3 | 2732 |
| Education_Level | 8608 | 6 | Graduate | 3128 |
| Marital_Status | 9378 | 3 | Married | 4687 |
| Income_Category | 9015 | 5 | Less than $40K | 3561 |
| Card_Category | 10127 | 4 | Blue | 9436 |
| Total_Relationship_Count | 10127 | 6 | 3 | 2305 |
| Months_Inactive_12_mon | 10127 | 7 | 3 | 3846 |
| Contacts_Count_12_mon | 10127 | 7 | 3 | 3380 |
Observations:
plt.figure(figsize=(20,7))
ax = sns.countplot(data[cat_col[0]]) #count plot for Attrition_Flag
plt.xlabel(cat_col[0])
plt.ylabel('Count')
bar_perc(ax,data[cat_col[0]]) #annotate each bar with its percentage of the total
Observations:
plt.figure(figsize=(20,7))
ax = sns.countplot(data[cat_col[1]]) #count plot for Name
plt.xlabel(cat_col[1])
plt.ylabel('Count')
bar_perc(ax,data[cat_col[1]])
Observations:
plt.figure(figsize=(20,7))
ax = sns.countplot(data[cat_col[2]]) #count plot for Name
plt.xlabel(cat_col[2])
plt.ylabel('Count')
bar_perc(ax,data[cat_col[2]])
Observations:
plt.figure(figsize=(20,7))
ax = sns.countplot(data[cat_col[3]]) #count plot for Name
plt.xlabel(cat_col[3])
plt.ylabel('Count')
bar_perc(ax,data[cat_col[3]])
Observations:
plt.figure(figsize=(20,7))
ax = sns.countplot(data[cat_col[4]]) #count plot for Name
plt.xlabel(cat_col[4])
plt.ylabel('Count')
bar_perc(ax,data[cat_col[4]])
Observations:
plt.figure(figsize=(20,7))
ax = sns.countplot(data[cat_col[5]]) #count plot for Name
plt.xlabel(cat_col[5])
plt.ylabel('Count')
bar_perc(ax,data[cat_col[5]])
Observations:
plt.figure(figsize=(20,7))
ax = sns.countplot(data[cat_col[6]]) #count plot for Name
plt.xlabel(cat_col[6])
plt.ylabel('Count')
bar_perc(ax,data[cat_col[6]])
Observations:
plt.figure(figsize=(20,7))
ax = sns.countplot(data[cat_col[7]]) #count plot for Name
plt.xlabel(cat_col[7])
plt.ylabel('Count')
bar_perc(ax,data[cat_col[7]])
Observations:
plt.figure(figsize=(20,7))
ax = sns.countplot(data[cat_col[8]]) #count plot for Name
plt.xlabel(cat_col[8])
plt.ylabel('Count')
bar_perc(ax,data[cat_col[8]])
Observations:
plt.figure(figsize=(20,7))
ax = sns.countplot(data[cat_col[9]]) #count plot for Name
plt.xlabel(cat_col[9])
plt.ylabel('Count')
bar_perc(ax,data[cat_col[9]])
Observations:
data.corr()
| Customer_Age | Months_on_book | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|
| Customer_Age | 1.000000 | 0.787989 | 0.002699 | 0.014778 | 0.000851 | -0.070923 | -0.040467 | -0.067010 | -0.027871 | 0.007108 |
| Months_on_book | 0.787989 | 1.000000 | 0.007977 | 0.008326 | 0.006756 | -0.055963 | -0.033587 | -0.049957 | -0.026631 | -0.007056 |
| Credit_Limit | 0.002699 | 0.007977 | 1.000000 | 0.046411 | 0.994238 | 0.015127 | 0.139704 | 0.068896 | -0.007484 | -0.518439 |
| Total_Revolving_Bal | 0.014778 | 0.008326 | 0.046411 | 1.000000 | -0.055839 | 0.051160 | 0.048189 | 0.056055 | 0.093643 | 0.624022 |
| Avg_Open_To_Buy | 0.000851 | 0.006756 | 0.994238 | -0.055839 | 1.000000 | 0.009522 | 0.135763 | 0.064078 | -0.017100 | -0.587529 |
| Total_Amt_Chng_Q4_Q1 | -0.070923 | -0.055963 | 0.015127 | 0.051160 | 0.009522 | 1.000000 | 0.092683 | 0.057027 | 0.347408 | 0.031923 |
| Total_Trans_Amt | -0.040467 | -0.033587 | 0.139704 | 0.048189 | 0.135763 | 0.092683 | 1.000000 | 0.860830 | 0.164096 | -0.057683 |
| Total_Trans_Ct | -0.067010 | -0.049957 | 0.068896 | 0.056055 | 0.064078 | 0.057027 | 0.860830 | 1.000000 | 0.195731 | 0.002860 |
| Total_Ct_Chng_Q4_Q1 | -0.027871 | -0.026631 | -0.007484 | 0.093643 | -0.017100 | 0.347408 | 0.164096 | 0.195731 | 1.000000 | 0.084179 |
| Avg_Utilization_Ratio | 0.007108 | -0.007056 | -0.518439 | 0.624022 | -0.587529 | 0.031923 | -0.057683 | 0.002860 | 0.084179 | 1.000000 |
plt.figure(figsize=(30,7))
sns.pairplot(data = data, hue = 'Attrition_Flag')
<seaborn.axisgrid.PairGrid at 0x1b75eb194f0>
<Figure size 2160x504 with 0 Axes>
Observations:
Let's define one more function to plot stacked bar charts
### Function to plot stacked bar charts for categorical columns
def stacked_plot(x,flag=True):
    """Plot a stacked bar chart of Attrition_Flag proportions per level of x.

    x: pandas Series, categorical column to cross-tabulate against the target
    flag: bool, when True also print the raw crosstab with margins

    NOTE(review): reads the target column from the module-level `data`
    DataFrame rather than taking it as a parameter.
    """
    sns.set(palette='bright')
    tab1 = pd.crosstab(x,data['Attrition_Flag'],margins=True)
    if flag==True:
        print(tab1)
        print('-'*120)
    # row-normalised crosstab so each bar shows within-category proportions
    tab = pd.crosstab(x,data['Attrition_Flag'],normalize='index')
    tab.plot(kind='bar',stacked=True,figsize=(10,5))
    # the original called plt.legend twice; the first call was immediately
    # overridden by the second, so only the effective placement is kept
    plt.legend(loc="upper left", bbox_to_anchor=(1,1))
    plt.show()
stacked_plot(data[cat_col[1]])
Attrition_Flag Attrited Customer Existing Customer All Gender F 930 4428 5358 M 697 4072 4769 All 1627 8500 10127 ------------------------------------------------------------------------------------------------------------------------
Observations:
stacked_plot(data[cat_col[2]])
Attrition_Flag Attrited Customer Existing Customer All Dependent_count 0 135 769 904 1 269 1569 1838 2 417 2238 2655 3 482 2250 2732 4 260 1314 1574 5 64 360 424 All 1627 8500 10127 ------------------------------------------------------------------------------------------------------------------------
Observations:
stacked_plot(data[cat_col[3]])
Attrition_Flag Attrited Customer Existing Customer All Education_Level College 154 859 1013 Doctorate 95 356 451 Graduate 487 2641 3128 High School 306 1707 2013 Post-Graduate 92 424 516 Uneducated 237 1250 1487 All 1371 7237 8608 ------------------------------------------------------------------------------------------------------------------------
Observations:
stacked_plot(data[cat_col[4]])
Attrition_Flag Attrited Customer Existing Customer All Marital_Status Divorced 121 627 748 Married 709 3978 4687 Single 668 3275 3943 All 1498 7880 9378 ------------------------------------------------------------------------------------------------------------------------
Observations:
stacked_plot(data[cat_col[5]])
Attrition_Flag Attrited Customer Existing Customer All Income_Category $120K + 126 601 727 $40K - $60K 271 1519 1790 $60K - $80K 189 1213 1402 $80K - $120K 242 1293 1535 Less than $40K 612 2949 3561 All 1440 7575 9015 ------------------------------------------------------------------------------------------------------------------------
Observation:
stacked_plot(data[cat_col[6]])
Attrition_Flag Attrited Customer Existing Customer All Card_Category Blue 1519 7917 9436 Gold 21 95 116 Platinum 5 15 20 Silver 82 473 555 All 1627 8500 10127 ------------------------------------------------------------------------------------------------------------------------
Observations:
stacked_plot(data[cat_col[7]])
Attrition_Flag Attrited Customer Existing Customer All Total_Relationship_Count 1 233 677 910 2 346 897 1243 3 400 1905 2305 4 225 1687 1912 5 227 1664 1891 6 196 1670 1866 All 1627 8500 10127 ------------------------------------------------------------------------------------------------------------------------
Observations:
stacked_plot(data[cat_col[8]])
Attrition_Flag Attrited Customer Existing Customer All Months_Inactive_12_mon 0 15 14 29 1 100 2133 2233 2 505 2777 3282 3 826 3020 3846 4 130 305 435 5 32 146 178 6 19 105 124 All 1627 8500 10127 ------------------------------------------------------------------------------------------------------------------------
Observations:
stacked_plot(data[cat_col[9]])
Attrition_Flag Attrited Customer Existing Customer All Contacts_Count_12_mon 0 7 392 399 1 108 1391 1499 2 403 2824 3227 3 681 2699 3380 4 315 1077 1392 5 59 117 176 6 54 0 54 All 1627 8500 10127 ------------------------------------------------------------------------------------------------------------------------
Observations:
# Numerical columns to compare across the two attrition classes
cols = data[['Customer_Age', 'Months_on_book', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']].columns.tolist()
plt.figure(figsize=(20,20))
for i, variable in enumerate(cols): #one boxplot per numerical variable, split by attrition status
    plt.subplot(4,3,i+1)
    sns.boxplot(data['Attrition_Flag'], data[variable],palette="PuBu")
    plt.tight_layout()
    plt.title(variable)
plt.show()
Observations:
data[(data['Attrition_Flag']== 'Existing Customer')].describe(include='all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Attrition_Flag | 8500 | 1 | Existing Customer | 8500 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Customer_Age | 8500 | NaN | NaN | NaN | 46.2614 | 8.07905 | 26 | 41 | 46 | 52 | 68.5 |
| Gender | 8500 | 2 | F | 4428 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Dependent_count | 8500 | 6 | 3 | 2250 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Education_Level | 7237 | 6 | Graduate | 2641 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Marital_Status | 7880 | 3 | Married | 3978 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Income_Category | 7575 | 5 | Less than $40K | 2949 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Card_Category | 8500 | 4 | Blue | 7917 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Months_on_book | 8500 | NaN | NaN | NaN | 35.8994 | 7.80276 | 17.5 | 31 | 36 | 40 | 53.5 |
| Total_Relationship_Count | 8500 | 6 | 3 | 1905 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Months_Inactive_12_mon | 8500 | 7 | 3 | 3020 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Contacts_Count_12_mon | 8500 | 6 | 2 | 2824 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Credit_Limit | 8500 | NaN | NaN | NaN | 7981.04 | 7234.72 | 1438.3 | 2602 | 4643.5 | 11252.8 | 23836.2 |
| Total_Revolving_Bal | 8500 | NaN | NaN | NaN | 1256.6 | 757.745 | 0 | 800 | 1364 | 1807 | 2517 |
| Avg_Open_To_Buy | 8500 | NaN | NaN | NaN | 6735.48 | 7264.9 | 15 | 1184.5 | 3469.5 | 9978.25 | 22660.8 |
| Total_Amt_Chng_Q4_Q1 | 8500 | NaN | NaN | NaN | 0.761765 | 0.178332 | 0.289 | 0.643 | 0.743 | 0.86 | 1.201 |
| Total_Trans_Amt | 8500 | NaN | NaN | NaN | 4118.34 | 2109.58 | 816 | 2384.75 | 4100 | 4781.25 | 8619.25 |
| Total_Trans_Ct | 8500 | NaN | NaN | NaN | 68.6718 | 22.9166 | 11 | 54 | 71 | 82 | 135 |
| Total_Ct_Chng_Q4_Q1 | 8500 | NaN | NaN | NaN | 0.731716 | 0.18199 | 0.228 | 0.617 | 0.721 | 0.833 | 1.172 |
| Avg_Utilization_Ratio | 8500 | NaN | NaN | NaN | 0.296412 | 0.272568 | 0 | 0.055 | 0.211 | 0.52925 | 0.994 |
Observations on customer profile for Existing customers:
sns.set(rc={'figure.figsize':(7,7)})
# Correlation heatmap of the numerical variables
sns.heatmap(data.corr(),
            annot=True,
            linewidths=.5,
            center=0,
            cbar=False,
            vmin=0, vmax=0.5,
            fmt='0.2f')
plt.show()
Observations:
df = data.copy() #copying the dataset
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null category 1 Customer_Age 10127 non-null float64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null category 4 Education_Level 8608 non-null category 5 Marital_Status 9378 non-null category 6 Income_Category 9015 non-null category 7 Card_Category 10127 non-null category 8 Months_on_book 10127 non-null float64 9 Total_Relationship_Count 10127 non-null category 10 Months_Inactive_12_mon 10127 non-null category 11 Contacts_Count_12_mon 10127 non-null category 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null float64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 dtypes: category(10), float64(8), int64(2) memory usage: 892.2 KB
df.head()
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Existing Customer | 45.0 | M | 3 | High School | Married | $60K - $80K | Blue | 39.0 | 5 | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.201 | 1144.0 | 42 | 1.172 | 0.061 |
| 1 | Existing Customer | 49.0 | F | 5 | Graduate | Single | Less than $40K | Blue | 44.0 | 6 | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.201 | 1291.0 | 33 | 1.172 | 0.105 |
| 2 | Existing Customer | 51.0 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36.0 | 4 | 1 | 0 | 3418.0 | 0 | 3418.0 | 1.201 | 1887.0 | 20 | 1.172 | 0.000 |
| 3 | Existing Customer | 40.0 | F | 4 | High School | NaN | Less than $40K | Blue | 34.0 | 3 | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.201 | 1171.0 | 20 | 1.172 | 0.760 |
| 4 | Existing Customer | 40.0 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21.0 | 5 | 1 | 0 | 4716.0 | 0 | 4716.0 | 1.201 | 816.0 | 28 | 1.172 | 0.000 |
for i in cat_col:
print(data[i].value_counts()) #printing the unique values of categorical variables
print('-'*50)
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64 -------------------------------------------------- F 5358 M 4769 Name: Gender, dtype: int64 -------------------------------------------------- 3 2732 2 2655 1 1838 4 1574 0 904 5 424 Name: Dependent_count, dtype: int64 -------------------------------------------------- Graduate 3128 High School 2013 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64 -------------------------------------------------- Married 4687 Single 3943 Divorced 748 Name: Marital_Status, dtype: int64 -------------------------------------------------- Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 $120K + 727 Name: Income_Category, dtype: int64 -------------------------------------------------- Blue 9436 Silver 555 Gold 116 Platinum 20 Name: Card_Category, dtype: int64 -------------------------------------------------- 3 2305 4 1912 5 1891 6 1866 2 1243 1 910 Name: Total_Relationship_Count, dtype: int64 -------------------------------------------------- 3 3846 2 3282 1 2233 4 435 5 178 6 124 0 29 Name: Months_Inactive_12_mon, dtype: int64 -------------------------------------------------- 3 3380 2 3227 1 1499 4 1392 0 399 5 176 6 54 Name: Contacts_Count_12_mon, dtype: int64 --------------------------------------------------
df.isnull().sum()
Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 1112 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
KNNImputer: Each sample's missing values are imputed by looking at the n_neighbors nearest neighbors found in the training set. The default value for n_neighbors is 5.
imputer = KNNImputer(n_neighbors=5) #imputer used to fill the remaining missing values
# defining a list with the names of the columns that still contain missing
# values and will be filled by KNN imputation
reqd_col_for_impute = [
    "Education_Level",
    "Marital_Status",
    "Income_Category",
]
df[reqd_col_for_impute].head()  # preview the columns to be imputed
| Education_Level | Marital_Status | Income_Category | |
|---|---|---|---|
| 0 | High School | Married | $60K - $80K |
| 1 | Graduate | Single | Less than $40K |
| 2 | Graduate | Married | $80K - $120K |
| 3 | High School | NaN | Less than $40K |
| 4 | Uneducated | Married | $60K - $80K |
# we need to pass numerical values for each categorical column for KNN imputation so we will label encode them
Gender = {"F": 0, "M": 1}
df["Gender"] = df["Gender"].map(Gender)
# NOTE(review): this ordinal scale places Graduate (4) above Post-Graduate (3);
# confirm that ordering is intentional before treating the codes as ordered.
Education_Level= {"Uneducated": 0, "High School": 1, "College": 2, "Post-Graduate": 3, "Graduate" : 4, "Doctorate": 5}
df["Education_Level"] = df["Education_Level"].map(Education_Level)
Marital_Status = {
    "Single": 0,
    "Married": 1,
    "Divorced": 2,
}
df["Marital_Status"] = df["Marital_Status"].map(Marital_Status)
# income brackets encoded in increasing order
Income_Category = {
    "Less than $40K": 0,
    "$40K - $60K": 1,
    "$60K - $80K": 2,
    "$80K - $120K": 3,
    "$120K +": 4,
}
df["Income_Category"] = df["Income_Category"].map(Income_Category)
Card_Category = {
    "Blue": 0,
    "Silver": 1,
    "Gold": 2,
    "Platinum": 3,
}
df["Card_Category"] = df["Card_Category"].map(Card_Category)
df.head()  # NaNs are left as NaN by .map, so they remain available for imputation
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Existing Customer | 45.0 | 1 | 3 | 1 | 1 | 2 | 0 | 39.0 | 5 | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.201 | 1144.0 | 42 | 1.172 | 0.061 |
| 1 | Existing Customer | 49.0 | 0 | 5 | 4 | 0 | 0 | 0 | 44.0 | 6 | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.201 | 1291.0 | 33 | 1.172 | 0.105 |
| 2 | Existing Customer | 51.0 | 1 | 3 | 4 | 1 | 3 | 0 | 36.0 | 4 | 1 | 0 | 3418.0 | 0 | 3418.0 | 1.201 | 1887.0 | 20 | 1.172 | 0.000 |
| 3 | Existing Customer | 40.0 | 0 | 4 | 1 | NaN | 0 | 0 | 34.0 | 3 | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.201 | 1171.0 | 20 | 1.172 | 0.760 |
| 4 | Existing Customer | 40.0 | 1 | 3 | 0 | 1 | 2 | 0 | 21.0 | 5 | 1 | 0 | 4716.0 | 0 | 4716.0 | 1.201 | 816.0 | 28 | 1.172 | 0.000 |
# Separate the features from the target column.
X = df.drop(["Attrition_Flag"], axis=1)
# NOTE(review): the positive class (1) is "Existing Customer" (~84% of rows),
# so every "recall" computed below measures recall on *retained* customers,
# not on churners — confirm this is the intended positive class for a churn
# prediction problem.
y = df["Attrition_Flag"].apply(lambda x: 1 if x == "Existing Customer" else 0)
# Splitting data into training, validation and test set:
# first we split data into 2 parts, say temporary and test
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# then we split the temporary set into train and validation
# (0.25 of the remaining 80% -> a 60/20/20 overall split)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 19) (2026, 19) (2026, 19)
print("Number of rows in train data =", X_train.shape[0])
print("Number of rows in validation data =", X_val.shape[0])
print("Number of rows in test data =", X_test.shape[0])
Number of rows in train data = 6075 Number of rows in validation data = 2026 Number of rows in test data = 2026
Imputing Missing Values
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null category 1 Customer_Age 10127 non-null float64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null category 4 Education_Level 8608 non-null category 5 Marital_Status 9378 non-null category 6 Income_Category 9015 non-null category 7 Card_Category 10127 non-null category 8 Months_on_book 10127 non-null float64 9 Total_Relationship_Count 10127 non-null category 10 Months_Inactive_12_mon 10127 non-null category 11 Contacts_Count_12_mon 10127 non-null category 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null float64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 dtypes: category(10), float64(8), int64(2) memory usage: 892.2 KB
# Impute missing categorical codes with the KNN imputer.
# The imputer must be fit on the training split ONLY; the original cell
# called fit_transform on the validation set as well, which re-fits the
# imputer on validation rows (data leakage) and fills values from different
# neighbour statistics than the train/test splits.
# Fit and transform the train data
X_train[reqd_col_for_impute] = imputer.fit_transform(X_train[reqd_col_for_impute])
# Transform the validation data (no re-fitting)
X_val[reqd_col_for_impute] = imputer.transform(X_val[reqd_col_for_impute])
# Transform the test data (no re-fitting)
X_test[reqd_col_for_impute] = imputer.transform(X_test[reqd_col_for_impute])
# Checking that no column has missing values in train, validation or test sets
# (every feature should report 0 after KNN imputation)
print(X_train.isna().sum())
print("-" * 30)
print(X_val.isna().sum())
print("-" * 30)
print(X_test.isna().sum())
Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64 ------------------------------ Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64 ------------------------------ Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
X_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 6075 entries, 9501 to 703 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer_Age 6075 non-null float64 1 Gender 6075 non-null category 2 Dependent_count 6075 non-null category 3 Education_Level 6075 non-null float64 4 Marital_Status 6075 non-null float64 5 Income_Category 6075 non-null float64 6 Card_Category 6075 non-null category 7 Months_on_book 6075 non-null float64 8 Total_Relationship_Count 6075 non-null category 9 Months_Inactive_12_mon 6075 non-null category 10 Contacts_Count_12_mon 6075 non-null category 11 Credit_Limit 6075 non-null float64 12 Total_Revolving_Bal 6075 non-null int64 13 Avg_Open_To_Buy 6075 non-null float64 14 Total_Amt_Chng_Q4_Q1 6075 non-null float64 15 Total_Trans_Amt 6075 non-null float64 16 Total_Trans_Ct 6075 non-null int64 17 Total_Ct_Chng_Q4_Q1 6075 non-null float64 18 Avg_Utilization_Ratio 6075 non-null float64 dtypes: category(6), float64(11), int64(2) memory usage: 701.5 KB
cat_col.remove('Attrition_Flag')  # the target is not a feature column
# One-hot encode the categorical columns. pd.get_dummies is applied to each
# split independently, so a category level missing from one split would
# produce mismatched dummy columns; after encoding, re-align validation and
# test to the training columns (missing dummies filled with 0, columns for
# categories unseen in training dropped).
X_train = pd.get_dummies(X_train, columns=cat_col, drop_first=True)  # creating dummy variables
X_val = pd.get_dummies(X_val, columns=cat_col, drop_first=True)
X_test = pd.get_dummies(X_test, columns=cat_col, drop_first=True)
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 50) (2026, 50) (2026, 50)
X_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 6075 entries, 9501 to 703 Data columns (total 50 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer_Age 6075 non-null float64 1 Months_on_book 6075 non-null float64 2 Credit_Limit 6075 non-null float64 3 Total_Revolving_Bal 6075 non-null int64 4 Avg_Open_To_Buy 6075 non-null float64 5 Total_Amt_Chng_Q4_Q1 6075 non-null float64 6 Total_Trans_Amt 6075 non-null float64 7 Total_Trans_Ct 6075 non-null int64 8 Total_Ct_Chng_Q4_Q1 6075 non-null float64 9 Avg_Utilization_Ratio 6075 non-null float64 10 Gender_1 6075 non-null uint8 11 Dependent_count_1 6075 non-null uint8 12 Dependent_count_2 6075 non-null uint8 13 Dependent_count_3 6075 non-null uint8 14 Dependent_count_4 6075 non-null uint8 15 Dependent_count_5 6075 non-null uint8 16 Education_Level_0.0 6075 non-null uint8 17 Education_Level_1.0 6075 non-null uint8 18 Education_Level_2.0 6075 non-null uint8 19 Education_Level_3.0 6075 non-null uint8 20 Education_Level_4.0 6075 non-null uint8 21 Education_Level_5.0 6075 non-null uint8 22 Marital_Status_0.0 6075 non-null uint8 23 Marital_Status_1.0 6075 non-null uint8 24 Marital_Status_2.0 6075 non-null uint8 25 Income_Category_0.0 6075 non-null uint8 26 Income_Category_1.0 6075 non-null uint8 27 Income_Category_2.0 6075 non-null uint8 28 Income_Category_3.0 6075 non-null uint8 29 Income_Category_4.0 6075 non-null uint8 30 Card_Category_2 6075 non-null uint8 31 Card_Category_3 6075 non-null uint8 32 Card_Category_1 6075 non-null uint8 33 Total_Relationship_Count_2 6075 non-null uint8 34 Total_Relationship_Count_3 6075 non-null uint8 35 Total_Relationship_Count_4 6075 non-null uint8 36 Total_Relationship_Count_5 6075 non-null uint8 37 Total_Relationship_Count_6 6075 non-null uint8 38 Months_Inactive_12_mon_1 6075 non-null uint8 39 Months_Inactive_12_mon_2 6075 non-null uint8 40 Months_Inactive_12_mon_3 6075 non-null uint8 41 Months_Inactive_12_mon_4 6075 non-null uint8 42 
Months_Inactive_12_mon_5 6075 non-null uint8 43 Months_Inactive_12_mon_6 6075 non-null uint8 44 Contacts_Count_12_mon_1 6075 non-null uint8 45 Contacts_Count_12_mon_2 6075 non-null uint8 46 Contacts_Count_12_mon_3 6075 non-null uint8 47 Contacts_Count_12_mon_4 6075 non-null uint8 48 Contacts_Count_12_mon_5 6075 non-null uint8 49 Contacts_Count_12_mon_6 6075 non-null uint8 dtypes: float64(8), int64(2), uint8(40) memory usage: 759.4 KB
y_train.value_counts()
1 5099 0 976 Name: Attrition_Flag, dtype: int64
For model building, we will first build six models — Logistic Regression, Bagging, GBM, AdaBoost, XGBoost, and a Decision Tree. We will evaluate each with k-fold cross-validation and then on the validation data set, measuring the recall score of each model. From these six, we will choose the three models with the best recall score on the validation set.
# Baseline bake-off: evaluate six untuned classifiers with 5-fold stratified
# cross-validated recall on the training data, then recall on the held-out
# validation set.
models = [
    ("Bagging", BaggingClassifier(random_state=1)),
    ("Logistic_Regression", LogisticRegression(random_state=1)),
    ("GBM", GradientBoostingClassifier(random_state=1)),
    ("Adaboost", AdaBoostClassifier(random_state=1)),
    ("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")),
    ("dtree", DecisionTreeClassifier(random_state=1)),
]
results = []  # per-model arrays of CV fold scores (used by the boxplot below)
names = []  # model names, in evaluation order
score = []  # validation recall per model
print("\n" "Cross-Validation Performance:" "\n")
# The splitter and scoring are identical for every model, so build them once;
# with a fixed random_state the folds are the same on every call.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for name, model in models:
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
print("\n" "Validation Performance:" "\n")
for name, model in models:
    model.fit(X_train, y_train)
    scores = recall_score(y_val, model.predict(X_val))
    score.append(scores)
    print("{}: {}".format(name, scores))
Cross-Validation Performance: Bagging: 97.0977313398372 Logistic_Regression: 96.92102984471511 GBM: 98.78408281860341 Adaboost: 97.94081086801748 Xgboost: 98.66637803305818 dtree: 96.29365583328521 Validation Performance: Bagging: 0.9788235294117648 Logistic_Regression: 0.97 GBM: 0.9894117647058823 Adaboost: 0.9823529411764705 Xgboost: 0.99 dtree: 0.9594117647058824
# Plotting boxplots for CV scores of all models defined above
# (one box per model, built from the 5 per-fold recall values in `results`)
fig = plt.figure(figsize=(10, 7))
fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
Observations:
We will tune Adaboost, Gradient boosting and xgboost models using RandomizedSearchCV.
First, let's create two functions to calculate different metrics and confusion matrix so that we don't have to use the same code repeatedly for each model.
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Compute accuracy, recall, precision and F1 for a fitted classifier and
    return them as a one-row DataFrame (handy for later pd.concat comparisons).

    model: fitted classifier exposing .predict
    predictors: independent variables
    target: true labels
    """
    # predict once, then derive every metric from the same predictions
    pred = model.predict(predictors)
    metric_values = {
        "Accuracy": accuracy_score(target, pred),
        "Recall": recall_score(target, pred),
        "Precision": precision_score(target, pred),
        "F1": f1_score(target, pred),
    }
    return pd.DataFrame(metric_values, index=[0])
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot the confusion matrix with per-cell count and percentage-of-total.
    model: fitted classifier exposing .predict
    predictors: independent variables
    target: true labels
    """
    # predict, then annotate each cell as "count\npercent of all samples"
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    labels = np.asarray(
        [
            ["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / cm.flatten().sum())]
            for item in cm.flatten()
        ]
    ).reshape(2, 2)  # the reshape assumes binary classification (2x2 matrix)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
# defining model
model = AdaBoostClassifier(random_state=1)
# Parameter grid to pass in GridSearchCV
param_grid = {
    "n_estimators": np.arange(10, 110, 10),
    "learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
    # NOTE(review): "base_estimator" was renamed to "estimator" in
    # scikit-learn >= 1.2; this spelling only works on older versions.
    "base_estimator": [
        DecisionTreeClassifier(max_depth=1, random_state=1),
        DecisionTreeClassifier(max_depth=2, random_state=1),
        DecisionTreeClassifier(max_depth=3, random_state=1),
    ],
}
# Type of scoring used to compare parameter combinations
# NOTE(review): with label 1 ("Existing Customer") at ~84% of rows, recall on
# the positive class reaches 1.0 for an all-positive predictor — the CV
# score of 1.0 reported below suggests the scorer cannot discriminate
# between these parameter combinations.
scorer = metrics.make_scorer(metrics.recall_score)
# Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_jobs = -1, n_iter=50, scoring=scorer, cv=5, random_state=1)
# Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))
Best parameters are {'n_estimators': 50, 'learning_rate': 0.01, 'base_estimator': DecisionTreeClassifier(max_depth=1, random_state=1)} with CV score=1.0:
# building model with best parameters found by the randomized search above
adb_tuned2 = AdaBoostClassifier(
    n_estimators=50,
    learning_rate= 0.01,
    random_state=1,
    base_estimator=DecisionTreeClassifier(max_depth=1, random_state=1),
)
# Fit the model on training data
adb_tuned2.fit(X_train, y_train)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1,
random_state=1),
learning_rate=0.01, random_state=1)
# Calculating different metrics on train set
# NOTE(review): the table below shows Recall=1.0 with Precision equal to
# Accuracy (0.839, the positive base rate) — this model predicts every row
# as the positive class.
Adaboost_random_train = model_performance_classification_sklearn(
    adb_tuned2, X_train, y_train
)
print("Training performance:")
Adaboost_random_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.839342 | 1.0 | 0.839342 | 0.912654 |
# Calculating different metrics on validation set
Adaboost_random_val = model_performance_classification_sklearn(adb_tuned2, X_val, y_val)
print("Validation performance:")
Adaboost_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.839092 | 1.0 | 0.839092 | 0.912507 |
# creating confusion matrix on the validation set
confusion_matrix_sklearn(adb_tuned2, X_val, y_val)
Observations:
# Choose the type of classifier.
# NOTE(review): init=AdaBoostClassifier(...) makes the gradient booster start
# from AdaBoost's initial predictions rather than the default prior — an
# unusual combination; confirm this is intentional.
gbc_tuned = GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),random_state=1)
# Grid of parameters to choose from
parameters = {
    "n_estimators": [100,150,200,250],
    "subsample":[0.8,0.9,1],
    "max_features":[0.7,0.8,0.9,1]
}
# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)
# Run the grid search
# NOTE(review): the grid has only 4*3*4 = 48 combinations, fewer than
# n_iter=50, so this randomized search effectively exhausts the grid.
gbc_tuned_rc = RandomizedSearchCV(estimator = gbc_tuned,param_distributions = parameters, n_jobs = -1, n_iter = 50, scoring=acc_scorer,cv=5, random_state=1)
gbc_tuned_rc.fit(X_train, y_train)
# Set the clf to the best combination of parameters
print(gbc_tuned_rc.best_estimator_)
print("Best parameters are {} with CV score={}:" .format(gbc_tuned_rc.best_params_,gbc_tuned_rc.best_score_))
GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
max_features=1, random_state=1, subsample=0.8)
Best parameters are {'subsample': 0.8, 'n_estimators': 100, 'max_features': 1} with CV score=0.9925470953837865:
# Rebuild the GBM with the best parameters reported by the randomized search
# above (best_params_: subsample=0.8, n_estimators=100, max_features=1).
# The original cell used subsample=0.9, which does not match the reported
# best parameters; use the tuned value.
# NOTE(review): the search estimator also used init=AdaBoostClassifier(...),
# which is dropped here — confirm whether the rebuild should include it.
gbc_tuned_rc = GradientBoostingClassifier(
    random_state = 1,
    subsample = 0.8,
    n_estimators = 100,
    max_features = 1,
)
gbc_tuned_rc.fit(X_train, y_train)
GradientBoostingClassifier(max_features=1, random_state=1, subsample=0.9)
# Calculating different metrics on train set for the tuned GBM
gbc_tuned_rc_random_train = model_performance_classification_sklearn(
    gbc_tuned_rc, X_train, y_train
)
print("Training performance:")
gbc_tuned_rc_random_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.907325 | 0.993528 | 0.905289 | 0.947359 |
# Calculating different metrics on validation set
gbc_random_val = model_performance_classification_sklearn(gbc_tuned_rc, X_val, y_val)
print("Validation performance:")
gbc_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.898322 | 0.993529 | 0.896497 | 0.942522 |
# creating confusion matrix on the validation set
confusion_matrix_sklearn(gbc_tuned_rc, X_val, y_val)
Observations:
# defining model
model = XGBClassifier(random_state=1,eval_metric='logloss')
# Parameter grid to pass in RandomizedSearchCV
# NOTE(review): scale_pos_weight > 1 upweights the positive class, but the
# positive class (1 = "Existing Customer") is already the ~5:1 majority —
# confirm the intent was not to upweight the minority churners instead.
param_grid={'n_estimators':np.arange(10,150,20),
    'scale_pos_weight':[2,5,10],
    'learning_rate':[0.01,0.1,0.2,0.05],
    'gamma':[0,1,3,5],
    'subsample':[0.5, 0.7, 0.8,0.9,1],
    'max_depth':np.arange(1,5,1),
    'reg_lambda':[5,10],
    "colsample_bytree":[0.5,0.7,0.9,1],
    "colsample_bylevel":[0.5,0.7,0.9,1]
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling RandomizedSearchCV
xgb_tuned2 = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)
# Fitting parameters in RandomizedSearchCV
xgb_tuned2.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(xgb_tuned2.best_params_,xgb_tuned2.best_score_))
Best parameters are {'subsample': 1, 'scale_pos_weight': 10, 'reg_lambda': 10, 'n_estimators': 30, 'max_depth': 1, 'learning_rate': 0.2, 'gamma': 0, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.9} with CV score=1.0:
# building model with the best parameters reported by the randomized search
# above (best_params_: subsample=1, scale_pos_weight=10, reg_lambda=10,
# n_estimators=30, max_depth=1, learning_rate=0.2, gamma=0,
# colsample_bytree=0.5, colsample_bylevel=0.9).
# The original cell hard-coded different values (n_estimators=90, gamma=3,
# subsample=0.8, learning_rate=0.01, max_depth=2, colsample_bytree=0.7),
# so the "best" model was not actually the one the search selected.
xgb_tuned2 = XGBClassifier(
    random_state=1,
    n_estimators=30,
    scale_pos_weight=10,
    gamma=0,
    subsample=1,
    learning_rate=0.2,
    max_depth=1,
    reg_lambda=10,
    colsample_bytree = 0.5,
    colsample_bylevel = 0.9,
    eval_metric='logloss'
)
# Fit the model on training data
xgb_tuned2.fit(X_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
colsample_bynode=1, colsample_bytree=0.7, eval_metric='logloss',
gamma=3, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.01, max_delta_step=0,
max_depth=2, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=90, n_jobs=8,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=10,
scale_pos_weight=10, subsample=0.8, tree_method='exact',
validate_parameters=1, verbosity=None)
# Calculating different metrics on train set for the tuned XGBoost model
xgboost_random_train = model_performance_classification_sklearn(
    xgb_tuned2, X_train, y_train
)
print("Training performance:")
xgboost_random_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.84856 | 1.0 | 0.847151 | 0.917251 |
# Calculating different metrics on validation set
xgboost_random_val = model_performance_classification_sklearn(xgb_tuned2, X_val, y_val)
print("Validation performance:")
xgboost_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.845015 | 1.0 | 0.844091 | 0.915455 |
# creating confusion matrix on the validation set
confusion_matrix_sklearn(xgb_tuned2, X_val, y_val)
Observation:
# Class counts before balancing ('Yes' = label 1, existing; 'No' = label 0)
print("Before UpSampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before UpSampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
# sampling_strategy=1 -> synthesize minority samples until the ratio is 1:1
sm = SMOTE(
    sampling_strategy=1, k_neighbors=5, random_state=1
)  # Synthetic Minority Over Sampling Technique
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print("After UpSampling, counts of label 'Yes': {}".format(sum(y_train_over == 1)))
print("After UpSampling, counts of label 'No': {} \n".format(sum(y_train_over == 0)))
print("After UpSampling, the shape of train_X: {}".format(X_train_over.shape))
print("After UpSampling, the shape of train_y: {} \n".format(y_train_over.shape))
Before UpSampling, counts of label 'Yes': 5099 Before UpSampling, counts of label 'No': 976 After UpSampling, counts of label 'Yes': 5099 After UpSampling, counts of label 'No': 5099 After UpSampling, the shape of train_X: (10198, 50) After UpSampling, the shape of train_y: (10198,)
# Same tuned AdaBoost configuration as adb_tuned2, refit on SMOTE data
adb_tuned2_over = AdaBoostClassifier(
    n_estimators=50,
    learning_rate= 0.01,
    random_state=1,
    base_estimator=DecisionTreeClassifier(max_depth=1, random_state=1),
)
# Fit the model on training data
adb_tuned2_over.fit(X_train_over, y_train_over)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1,
random_state=1),
learning_rate=0.01, random_state=1)
# Calculating different metrics on train set
Adaboost_random_train_over = model_performance_classification_sklearn(
    adb_tuned2_over, X_train_over, y_train_over
)
print("Training performance for oversampled data:")
Adaboost_random_train_over
Training performance for oversampled data:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.791332 | 0.743479 | 0.822164 | 0.780844 |
# Calculating different metrics on validation set
Adaboost_random_val_over = model_performance_classification_sklearn(adb_tuned2_over, X_val, y_val)
print("Validation performance:")
Adaboost_random_val_over
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.753702 | 0.748235 | 0.947133 | 0.836017 |
# NOTE(review): this confusion matrix is plotted on the oversampled TRAINING
# data, unlike the earlier sections which plotted the validation set —
# confirm intended.
confusion_matrix_sklearn(adb_tuned2_over, X_train_over, y_train_over)
Observations:
# Same tuned GBM configuration as gbc_tuned_rc, refit on SMOTE data
gbc_tuned_rc_over = GradientBoostingClassifier(
    random_state = 1,
    subsample = 0.9,
    n_estimators = 100,
    max_features = 1,
)
gbc_tuned_rc_over.fit(X_train_over, y_train_over)
GradientBoostingClassifier(max_features=1, random_state=1, subsample=0.9)
# Calculating different metrics on train set
gbc_tuned_rc_random_train_over = model_performance_classification_sklearn(
    gbc_tuned_rc_over, X_train_over, y_train_over
)
print("Training performance on Oversampled data:")
gbc_tuned_rc_random_train_over
Training performance on Oversampled data:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.937635 | 0.958423 | 0.920166 | 0.938905 |
# Calculating different metrics on validation set
gbc_random_val_over = model_performance_classification_sklearn(gbc_tuned_rc_over, X_val, y_val)
print("Validation performance:")
gbc_random_val_over
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.894373 | 0.955294 | 0.92168 | 0.938186 |
# NOTE(review): confusion matrix plotted on oversampled TRAINING data, not
# the validation set as in earlier sections — confirm intended.
confusion_matrix_sklearn(gbc_tuned_rc_over, X_train_over, y_train_over)
Observations:
# building model with best parameters (same hand-coded values as xgb_tuned2)
# NOTE(review): scale_pos_weight=10 on the SMOTE-balanced set re-skews the
# model toward positives — the train table below shows recall 0.991 at
# precision 0.605; confirm the weight should be kept after balancing.
xgb_tuned2_over = XGBClassifier(
    random_state=1,
    n_estimators=90,
    scale_pos_weight=10,
    gamma=3,
    subsample=0.8,
    learning_rate=0.01,
    max_depth=2,
    reg_lambda=10,
    colsample_bytree = 0.7,
    colsample_bylevel = 0.9,
    eval_metric='logloss'
)
# Fit the model on training data
xgb_tuned2_over.fit(X_train_over, y_train_over)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
colsample_bynode=1, colsample_bytree=0.7, eval_metric='logloss',
gamma=3, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.01, max_delta_step=0,
max_depth=2, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=90, n_jobs=8,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=10,
scale_pos_weight=10, subsample=0.8, tree_method='exact',
validate_parameters=1, verbosity=None)
# Calculating different metrics on train set
xgboost_random_train_over = model_performance_classification_sklearn(
    xgb_tuned2_over, X_train_over, y_train_over
)
print("Training performance on Oversampled data:")
xgboost_random_train_over
Training performance on Oversampled data:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.671995 | 0.991371 | 0.604955 | 0.751394 |
# Calculating different metrics on validation set
xgboost_random_val_over = model_performance_classification_sklearn(xgb_tuned2_over, X_val, y_val)
print("Validation performance:")
xgboost_random_val_over
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.883514 | 0.992353 | 0.883246 | 0.934626 |
# NOTE(review): confusion matrix plotted on oversampled TRAINING data, not
# the validation set as in earlier sections — confirm intended.
confusion_matrix_sklearn(xgb_tuned2_over, X_train_over, y_train_over)
Observations:
# Randomly drop majority-class rows until both classes match the minority count
rus = RandomUnderSampler(random_state=1)
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)
print("Before Under Sampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Under Sampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
print("After Under Sampling, counts of label 'Yes': {}".format(sum(y_train_under == 1)))
print("After Under Sampling, counts of label 'No': {} \n".format(sum(y_train_under == 0)))
print("After Under Sampling, the shape of train_X: {}".format(X_train_under.shape))
print("After Under Sampling, the shape of train_y: {} \n".format(y_train_under.shape))
Before Under Sampling, counts of label 'Yes': 5099 Before Under Sampling, counts of label 'No': 976 After Under Sampling, counts of label 'Yes': 976 After Under Sampling, counts of label 'No': 976 After Under Sampling, the shape of train_X: (1952, 50) After Under Sampling, the shape of train_y: (1952,)
# Same tuned AdaBoost configuration, refit on the undersampled data
adb_tuned2_under = AdaBoostClassifier(
    n_estimators=50,
    learning_rate= 0.01,
    random_state=1,
    base_estimator=DecisionTreeClassifier(max_depth=1, random_state=1),
)
# Fit the model on training data
adb_tuned2_under.fit(X_train_under, y_train_under)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1,
random_state=1),
learning_rate=0.01, random_state=1)
# Calculating different metrics on train set
Adaboost_random_train_under = model_performance_classification_sklearn(
    adb_tuned2_under, X_train_under, y_train_under
)
print("Training performance for Undersampled data:")
Adaboost_random_train_under
Training performance for Undersampled data:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.779201 | 0.695697 | 0.835178 | 0.759083 |
# Calculating different metrics on validation set
Adaboost_random_val_under = model_performance_classification_sklearn(adb_tuned2_under, X_val, y_val)
print("Validation performance for Undersampled Data:")
Adaboost_random_val_under
Validation performance for Undersampled Data:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.740375 | 0.721765 | 0.958594 | 0.82349 |
# NOTE(review): confusion matrix plotted on undersampled TRAINING data, not
# the validation set as in earlier sections — confirm intended.
confusion_matrix_sklearn(adb_tuned2_under, X_train_under, y_train_under)
Observations:
# Same tuned GBM configuration, refit on the undersampled data
gbc_tuned_rc_under = GradientBoostingClassifier(
    random_state = 1,
    subsample = 0.9,
    n_estimators = 100,
    max_features = 1,
)
gbc_tuned_rc_under.fit(X_train_under, y_train_under)
GradientBoostingClassifier(max_features=1, random_state=1, subsample=0.9)
# Calculating different metrics on train set
gbc_tuned_rc_random_train_under = model_performance_classification_sklearn(
    gbc_tuned_rc_under, X_train_under, y_train_under
)
print("Training performance on Undersampled data:")
gbc_tuned_rc_random_train_under
Training performance on Undersampled data:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.876537 | 0.884221 | 0.870838 | 0.877478 |
# Calculating different metrics on validation set
gbc_random_val_under = model_performance_classification_sklearn(gbc_tuned_rc_under, X_val, y_val)
print("Validation performance on Undersampled data:")
gbc_random_val_under
Validation performance on Undersampled data:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.859329 | 0.871176 | 0.957337 | 0.912227 |
# NOTE(review): confusion matrix plotted on undersampled TRAINING data, not
# the validation set as in earlier sections — confirm intended.
confusion_matrix_sklearn(gbc_tuned_rc_under, X_train_under, y_train_under)
Observations:
# building model with best parameters (same hand-coded values as xgb_tuned2)
# NOTE(review): scale_pos_weight=10 on the balanced undersampled set again
# skews the model toward positives (train recall 0.990 at precision 0.573
# in the table below) — confirm the weight should be kept after balancing.
xgb_tuned2_under = XGBClassifier(
    random_state=1,
    n_estimators=90,
    scale_pos_weight=10,
    gamma=3,
    subsample=0.8,
    learning_rate=0.01,
    max_depth=2,
    reg_lambda=10,
    colsample_bytree = 0.7,
    colsample_bylevel = 0.9,
    eval_metric='logloss'
)
# Fit the model on training data
xgb_tuned2_under.fit(X_train_under, y_train_under)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
colsample_bynode=1, colsample_bytree=0.7, eval_metric='logloss',
gamma=3, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.01, max_delta_step=0,
max_depth=2, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=90, n_jobs=8,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=10,
scale_pos_weight=10, subsample=0.8, tree_method='exact',
validate_parameters=1, verbosity=None)
# Calculating different metrics on train set
xgboost_random_train_under = model_performance_classification_sklearn(
    xgb_tuned2_under, X_train_under, y_train_under
)
print("Training performance on Undersampled data:")
xgboost_random_train_under
Training performance on Undersampled data:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.625512 | 0.989754 | 0.572614 | 0.725498 |
# Calculating different metrics on validation set
xgboost_random_val_under = model_performance_classification_sklearn(xgb_tuned2_under, X_val, y_val)
print("Validation performance on Undersampled data:")
xgboost_random_val_under
Validation performance on Undersampled data:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.871175 | 0.992941 | 0.871451 | 0.928238 |
# NOTE(review): confusion matrix plotted on undersampled TRAINING data, not
# the validation set as in earlier sections — confirm intended.
confusion_matrix_sklearn(xgb_tuned2_under, X_train_under, y_train_under)
Observations:
# training performance comparison
# Each metrics frame is one row; .T turns it into a column so the concat
# yields metrics as rows and models as columns.
models_train_comp_df = pd.concat(
    [
        Adaboost_random_train.T,
        Adaboost_random_train_over.T,
        Adaboost_random_train_under.T,
        gbc_tuned_rc_random_train.T,
        gbc_tuned_rc_random_train_over.T,
        gbc_tuned_rc_random_train_under.T,
        xgboost_random_train.T,
        xgboost_random_train_over.T,
        xgboost_random_train_under.T,
    ],
    axis=1,
)
# Labels must stay in the same order as the frames concatenated above
models_train_comp_df.columns = [
    "AdaBoost Tuned with Random search",
    "AdaBoost on Oversampled data",
    "AdaBoost on Undersampled data",
    "GBC Tuned with Random Search",
    "GBC on Oversampled data",
    "GBC on Undersampled data",
    "Xgboost Tuned with Random Search",
    "Xgboost on Oversampled data",
    "Xgboost on Undersampled data",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| AdaBoost Tuned with Random search | AdaBoost on Oversampled data | AdaBoost on Undersampled data | GBC Tuned with Random Search | GBC on Oversampled data | GBC on Undersampled data | Xgboost Tuned with Random Search | Xgboost on Oversampled data | Xgboost on Undersampled data | |
|---|---|---|---|---|---|---|---|---|---|
| Accuracy | 0.839342 | 0.791332 | 0.779201 | 0.907325 | 0.937635 | 0.876537 | 0.848560 | 0.671995 | 0.625512 |
| Recall | 1.000000 | 0.743479 | 0.695697 | 0.993528 | 0.958423 | 0.884221 | 1.000000 | 0.991371 | 0.989754 |
| Precision | 0.839342 | 0.822164 | 0.835178 | 0.905289 | 0.920166 | 0.870838 | 0.847151 | 0.604955 | 0.572614 |
| F1 | 0.912654 | 0.780844 | 0.759083 | 0.947359 | 0.938905 | 0.877478 | 0.917251 | 0.751394 | 0.725498 |
# validation performance comparison (same layout as the training comparison)
models_val_comp_df = pd.concat(
    [
        Adaboost_random_val.T,
        Adaboost_random_val_over.T,
        Adaboost_random_val_under.T,
        gbc_random_val.T,
        gbc_random_val_over.T,
        gbc_random_val_under.T,
        xgboost_random_val.T,
        xgboost_random_val_over.T,
        xgboost_random_val_under.T,
    ],
    axis=1,
)
# Labels must stay in the same order as the frames concatenated above
models_val_comp_df.columns = [
    "AdaBoost Tuned with Random search",
    "AdaBoost on Oversampled data",
    "AdaBoost on Undersampled data",
    "GBC Tuned with Random Search",
    "GBC on Oversampled data",
    "GBC on Undersampled data",
    "Xgboost Tuned with Random Search",
    "Xgboost on Oversampled data",
    "Xgboost on Undersampled data",
]
print("Validation data set performance comparison:")
models_val_comp_df
Validation data set performance comparison:
| AdaBoost Tuned with Random search | AdaBoost on Oversampled data | AdaBoost on Undersampled data | GBC Tuned with Random Search | GBC on Oversampled data | GBC on Undersampled data | Xgboost Tuned with Random Search | Xgboost on Oversampled data | Xgboost on Undersampled data | |
|---|---|---|---|---|---|---|---|---|---|
| Accuracy | 0.839092 | 0.753702 | 0.740375 | 0.898322 | 0.894373 | 0.859329 | 0.845015 | 0.883514 | 0.871175 |
| Recall | 1.000000 | 0.748235 | 0.721765 | 0.993529 | 0.955294 | 0.871176 | 1.000000 | 0.992353 | 0.992941 |
| Precision | 0.839092 | 0.947133 | 0.958594 | 0.896497 | 0.921680 | 0.957337 | 0.844091 | 0.883246 | 0.871451 |
| F1 | 0.912507 | 0.836017 | 0.823490 | 0.942522 | 0.938186 | 0.912227 | 0.915455 | 0.934626 | 0.928238 |
Observations:
# Calculating different metrics on the test set
# (final chosen model: the tuned GradientBoostingClassifier, evaluated once)
gbc_random_val_test = model_performance_classification_sklearn(gbc_tuned_rc, X_test, y_test)
print("Test performance")
gbc_random_val_test
Test performance
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.901777 | 0.994709 | 0.899044 | 0.94446 |
# creating confusion matrix on the test set
confusion_matrix_sklearn(gbc_tuned_rc, X_test, y_test)
# Horizontal bar chart of GBM feature importances; argsort is ascending, so
# the least important feature is drawn at the bottom, the most important on top.
feature_names = X_test.columns
importances = gbc_tuned_rc.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
Observations:
# creating a list of categorical variables
# NOTE(review): "Card_Category" is not listed here, so it reaches the model
# through the passthrough remainder as a raw label-encoded column — confirm
# it should not be one-hot encoded like the others.
categorical_features = ["Gender", 'Dependent_count', "Education_Level", "Marital_Status", "Income_Category", "Total_Relationship_Count", "Months_Inactive_12_mon", "Contacts_Count_12_mon" ]
# creating a transformer for categorical variables, which will first apply simple imputer and
# then do one hot encoding for categorical variables
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]
)
# handle_unknown = "ignore", allows model to handle any unknown category in the test data
# combining categorical transformer and numerical transformer using a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)
# remainder = "passthrough" has been used, it will allow variables that are present in original data
# but not in "numerical_columns" and "categorical_columns" to pass through the column transformer without any changes
# (fine for tree ensembles, which do not need feature scaling)
# Separating target variable and other variables
X = df.drop(columns="Attrition_Flag")
Y = df["Attrition_Flag"]
# Splitting the data into train and test sets
# NOTE(review): this rebinds X_train/X_test/y_train/y_test, discarding the
# earlier 60/20/20 encoded splits; the pipeline is refit on a fresh 70/30
# split of the label-encoded df (no separate validation set here).
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)
(7088, 19) (3039, 19)
# Creating new pipeline with best parameters
# (imputation + one-hot preprocessing chained with the tuned GBM configuration)
model = Pipeline(steps=[("pre", preprocessor),("GBC",GradientBoostingClassifier(random_state = 1,subsample = 0.9,n_estimators = 100,max_features = 1)),])
# Fit the model on training data
model.fit(X_train, y_train)
Pipeline(steps=[('pre',
ColumnTransformer(remainder='passthrough',
transformers=[('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
['Gender', 'Dependent_count',
'Education_Level',
'Marital_Status',
'Income_Category',
'Total_Relationship_Count',
'Months_Inactive_12_mon',
'Contacts_Count_12_mon'])])),
('GBC',
GradientBoostingClassifier(max_features=1, random_state=1,
subsample=0.9))])
y_predict = model.predict(X_test)
# NOTE(review): for classifiers Pipeline.score returns mean accuracy, while
# model selection above optimized recall — report recall here as well if
# that is the business metric.
model_score = model.score(X_test, y_test)
print("The model score for the Pipeline is: {:.2f}".format(model_score))
The model score for the Pipeline is: 0.90